0.1 Introduction:

This is our final group assignment for the Data Visualization and Storytelling course at LBS (MAM 2024).

In a world brimming with diverse cultures, economies, and environments, the quest for happiness remains a universal endeavor. Our group’s project embarks on an exploratory journey through the World Happiness Reports from 2015 to 2022. These reports, an amalgam of data, surveys, and research, offer a unique lens to view the changing landscapes of global happiness.

Our github repo is located here: https://github.com/Madikan/am10-mam2024-world-happiness

The database we are using is the World Happiness Report from 2015 to 2022, and can be found here: https://www.kaggle.com/datasets/mayzannilarthein44/world-happiness-report-2015-to-2022/

We also used Global Country Information Dataset 2023 to draw additional insights: https://www.kaggle.com/datasets/nelgiriyewithana/countries-of-the-world-2023

  1. Data Preparation:
# Import data and clean names
whr2015_2022 <- read.csv(here::here("data/world-happiness-report-2015-2022-cleaned.csv"),
                     stringsAsFactors = FALSE) %>%
  janitor::clean_names() 

# Remove the index column
whr2015_2022 <- select(whr2015_2022, -x) 

# Replace commas with dots and remove asterisks
whr2015_2022 <- data.frame(lapply(whr2015_2022, function(x) gsub(",", ".", x)))
whr2015_2022 <- data.frame(lapply(whr2015_2022, function(x) gsub("\\*", "", x)))

# Convert columns to numeric
columns_to_convert <- c("happiness_rank", "happiness_score", "economy_gdp_per_capita", "family_social_support", "health_life_expectancy", "freedom", "trust_government_corruption", "generosity", "year")
whr2015_2022[columns_to_convert] <- lapply(whr2015_2022[columns_to_convert], as.numeric)

# Mapping for inconsistent country names
country_name_mapping <- c(
  "Taiwan Province of China" = "Taiwan",
  "Hong Kong S.A.R. of China" = "Hong Kong",
  "Hong Kong S.A.R., China" = "Hong Kong",
  "Hong Kong S.A.R.. China" = "Hong Kong",
  "Czechia" = "Czech Republic",
  "North Macedonia" = "Macedonia", 
  "Trinidad & Tobago" = "Trinidad and Tobago", 
  "North Cyprus" = "Northern Cyprus",
  "Somaliland region" = "Somalia",
  "Somaliland Region" = "Somalia", 
  "Palestinian Territories" = "Palestine", 
  "Eswatini. Kingdom of" = "Swaziland")

# Apply the mapping to consolidate country names
whr2015_2022$country <- mapvalues(whr2015_2022$country, from = names(country_name_mapping), to = country_name_mapping)

# Mapping for inconsistent region names
region_name_mapping <- c(
  "Eastern Asia" = "East Asia",
  "Southeastern Asia" = "Southeast Asia",
  "Southern Asia" = "South Asia",
  "Middle East and Northern Africa" = "Middle East and North Africa")

# Apply the mapping to consolidate region names
whr2015_2022$region <- mapvalues(whr2015_2022$region, from = names(region_name_mapping), to = region_name_mapping)

# Define the correct region for each country based on the standardized assignments
correct_regions <- c(
  "Armenia" = "Central and Eastern Europe",
  "Australia" = "Australia and New Zealand",
  "Taiwan" = "East Asia",
  "Belize" = "Latin America and Caribbean",
  "Hong Kong" = "East Asia",
  "Somalia" = "Sub-Saharan Africa",
  "Namibia" = "Sub-Saharan Africa",
  "South Sudan" = "Sub-Saharan Africa",
  "Trinidad and Tobago" = "Latin America and Caribbean",
  "North Cyprus" = "Western Asia or Europe",
  "Macedonia" = "Central and Eastern Europe",
  "Gambia" = "Sub-Saharan Africa",
  "Luxembourg" = "Western Europe",
  "Czech Republic" = "Central and Eastern Europe",
  "Guatemala" = "Latin America and Caribbean",
  "Kuwait" = "Middle East and North Africa",
  "Belarus" = "Central and Eastern Europe",
  "Turkmenistan" = "Central and Eastern Europe",
  "Libya" = "Middle East and North Africa",
  "Azerbaijan" = "Central and Eastern Europe",
  "Liberia" = "Sub-Saharan Africa",
  "Congo" = "Sub-Saharan Africa",
  "Niger" = "Sub-Saharan Africa",
  "Comoros" = "Sub-Saharan Africa",
  "Palestine" = "Middle East and North Africa",
  "Swaziland" = "Sub-Saharan Africa",
  "Madagascar" = "Sub-Saharan Africa",
  "Chad" = "Sub-Saharan Africa",
  "Yemen" = "Middle East and North Africa",
  "Mauritania" = "Sub-Saharan Africa",
  "Lesotho" = "Sub-Saharan Africa",
  "Botswana" = "Sub-Saharan Africa",
  "Rwanda" = "Sub-Saharan Africa",
  "Canada" = "North America",
  "Georgia" = "Central and Eastern Europe",
  "Kazakhstan" = "Central and Eastern Europe",
  "Kyrgyzstan" = "Central and Eastern Europe",
  "Moldova" = "Central and Eastern Europe",
  "New Zealand" = "Australia and New Zealand",
  "Russia" = "Central and Eastern Europe",
  "Tajikistan" = "Central and Eastern Europe",
  "Ukraine" = "Central and Eastern Europe",
  "United States" = "North America",
  "Uzbekistan" = "Central and Eastern Europe",
  "Northern Cyprus" = "Western Europe"
)

# Update the region for each country in the dataset
for (country in names(correct_regions)) {
  whr2015_2022[whr2015_2022$country == country, "region"] <- correct_regions[country]
}

# View the updated data
glimpse(whr2015_2022)
## Rows: 1,229
## Columns: 11
## $ happiness_rank              <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
## $ country                     <chr> "Switzerland", "Iceland", "Denmark", "Norw…
## $ region                      <chr> "Western Europe", "Western Europe", "Weste…
## $ happiness_score             <dbl> 7.587, 7.561, 7.527, 7.522, 7.427, 7.406, …
## $ economy_gdp_per_capita      <dbl> 1.39651, 1.30232, 1.32548, 1.45900, 1.3262…
## $ family_social_support       <dbl> 1.34951, 1.40223, 1.36058, 1.33095, 1.3226…
## $ health_life_expectancy      <dbl> 0.94143, 0.94784, 0.87464, 0.88521, 0.9056…
## $ freedom                     <dbl> 0.66557, 0.62877, 0.64938, 0.66973, 0.6329…
## $ trust_government_corruption <dbl> 0.41978, 0.14145, 0.48357, 0.36503, 0.3295…
## $ generosity                  <dbl> 0.29678, 0.43630, 0.34139, 0.34699, 0.4581…
## $ year                        <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, …
# Import additional data and clean names
world_data_2023 <- read_csv(here::here("data/world-data-2023.csv")) %>%
  janitor::clean_names()

# Create a function that converts percentage strings to numeric values
convert_percent_to_numeric <- function(column) {
  if (is.character(column)) {
    # Remove the "%" sign and convert to numeric
    column <- str_remove_all(column, "%")
    column <- as.numeric(str_replace_all(column, ",", "."))
  }
  column
}

# Apply the conversion function to all columns that end with "_percent"
world_data_2023 <- world_data_2023 %>%
  mutate(
    across(
      .cols = c(ends_with("_percent"), ends_with("_rate")),
      .fns = ~convert_percent_to_numeric(.x)
    )
  ) %>%
  # Divide columns ending with "_percent" or "_rate" by 100
  mutate(
    across(
      .cols = c(ends_with("_percent"), ends_with("_rate")),
      .fns = ~ .x / 100
    )
  )

# Replace commas with dots and remove asterisks from all character columns
world_data_2023 <- world_data_2023 %>%
  mutate(across(where(is.character), ~str_replace_all(.x, ",", ".")))

# Remove asterisks from all character columns (if any)
world_data_2023 <- world_data_2023 %>%
  mutate(across(where(is.character), ~str_replace_all(.x, "\\*", "")))

# Create 2022 dataset
whr2022 <- whr2015_2022 %>%
  filter(year == 2022)

# Perform the left join
joined_data <- left_join(whr2022, world_data_2023, by = "country")

# Add CO2 emissions and Urban population columns
joined_data <- joined_data %>%
  mutate(
    co2_emissions_per_capita = case_when(
      is.na(co2_emissions) | is.na(population) | population == 0 ~ NA_real_,
      TRUE ~ co2_emissions / population
    ),
    urban_population_per_capita = case_when(
      is.na(urban_population) | is.na(population) | population == 0 ~ NA_real_,
      TRUE ~ urban_population / population
    )
  )
  1. Data exploration:
skim(whr2015_2022)
Data summary
Name whr2015_2022
Number of rows 1229
Number of columns 11
_______________________
Column type frequency:
character 2
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 0 1 4 24 0 165 0
region 0 1 9 28 0 10 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
happiness_rank 0 1 77.47 44.47 1.0 39.00 77.00 116.00 158.00 ▇▇▇▇▇
happiness_score 0 1 5.43 1.12 2.4 4.58 5.41 6.22 7.84 ▁▅▇▇▃
economy_gdp_per_capita 0 1 0.98 0.43 0.0 0.67 1.01 1.30 2.21 ▃▅▇▃▁
family_social_support 0 1 1.03 0.33 0.0 0.83 1.07 1.27 1.64 ▁▂▆▇▅
health_life_expectancy 0 1 0.61 0.24 0.0 0.44 0.64 0.79 1.14 ▂▅▇▇▂
freedom 0 1 0.44 0.15 0.0 0.34 0.46 0.56 0.74 ▁▃▆▇▃
trust_government_corruption 0 1 0.13 0.11 0.0 0.06 0.10 0.16 0.59 ▇▃▁▁▁
generosity 0 1 0.20 0.12 0.0 0.12 0.19 0.26 0.84 ▇▇▂▁▁
year 0 1 2018.45 2.28 2015.0 2016.00 2018.00 2020.00 2022.00 ▇▃▇▃▇

0.2 Data visualization:

# Create dataframe with data
country_data <- data.frame(
  country=whr2015_2022$country,
  value=whr2015_2022$happiness_score)

# Define the colors for the low, mid, and high values
low_color <- "#FF9999"  # Softer red
mid_color <- "#FFFF99"  # Softer yellow
high_color <- "#99CC99" # Softer green

# Create the color palette function
cols <- colorRampPalette(c(low_color, mid_color, high_color))

# Use cols function to generate the number of colors we need
palette_colors <- cols(length(whr2015_2022))

# Countries to Map function
capture.output(n <- invisible(joinCountryData2Map(country_data, 
                                   joinCode="NAME", 
                                   nameJoinColumn="country")), file='NUL')

# Output plot in pdf
pdf("world_happiness_map.pdf", width = 10, height = 7)
mapCountryData(n, 
               nameColumnToPlot="value", 
               mapTitle="Most unhappy countries are located in Sub-Saharan Africa and South-East Asia\nWorld Map for Happiness Score 2015-2022",
               colourPalette=palette_colors, 
               oceanCol = "#F0F8FF", 
               missingCountryCol = "#CCCCCCCC",
               addLegend = TRUE, 
               aspect = 1.1, 
               borderCol = "Black", 
               lwd =.1)

legend("bottom",  # Adjust position as needed
       legend=c("Low", "Medium", "High"),  # Example categories
       fill=c(low_color, mid_color, high_color),  # Corresponding colors
       title="Happiness Score",  # Title of the legend
       cex=0.8)  # Adjust text size as needed
capture.output(dev.off(), file='NUL')

# Output plot in R console
mapCountryData(n, 
               nameColumnToPlot="value", 
               mapTitle="Most unhappy countries are located in Sub-Saharan Africa and South-East Asia\nWorld Map for Happiness Score 2015-2022",
               colourPalette=palette_colors, 
               oceanCol = "#F0F8FF", 
               missingCountryCol = "#CCCCCCCC",
               addLegend = TRUE, 
               aspect = 1.1, 
               borderCol = "Black", 
               lwd =.1)

legend("bottom",  # Adjust position as needed
       legend=c("Low", "Medium", "High"),  # Example categories
       fill=c(low_color, mid_color, high_color),  # Corresponding colors
       title="Happiness Score",  # Title of the legend
       cex=0.8)  # Adjust text size as needed

# Select top 10 and bottom 10 countries based on happiness score
top10_bottom10_countries <- whr2022 %>%
  arrange(desc(happiness_score)) %>%
  slice(c(1:10, (n()-9):n()))

# Plotting
ggplot(top10_bottom10_countries) +
  geom_point(aes(x = economy_gdp_per_capita, 
                 y = happiness_score, 
                 size = happiness_score, 
                 colour = factor(region),
                 alpha = 0.85)) +
  scale_size_continuous(range = c(2, 15)) +
  geom_vline(xintercept = 1.4, colour = "#f7347a", linetype = "longdash") + 
  geom_hline(yintercept = 5, colour = "#f7347a", linetype = "longdash") +
  geom_text(aes(x = economy_gdp_per_capita, y = happiness_score, label = country), 
            hjust = "left", 
            vjust = "bottom", 
            check_overlap = TRUE, 
            size = 3) +
  theme(legend.position = "none") +
  labs(title = "There is a high correlation between higher GDP per capita and higher \nhappiness score, but there can also be lower happiness in higher \nGDP per capita countries",
       subtitle = "Happiness relative to GDP per capita for Top 5 and Bottom 5 countries in 2022",
       x = "GDP per capita",
       y = "Happiness score") +
  annotate("text", x = 0.83, y = 5.2, family = "Helvetica", size = 2.7, color = "gray20",
           label = "Lower GDP per capita") +
  annotate("text", x = 1.95, y = 5.2, family = "Helvetica", size = 2.7, color = "gray20",
           label = "Higher GDP per capita") +
  annotate("text", x = 1.53, y = 2.3, family = "Helvetica", size = 2.7, color = "gray20",
           label = "Lower Happiness") +
  annotate("text", x = 1.53, y = 8, family = "Helvetica", size = 2.7, color = "gray20",
           label = "Higher Happiness")

# Getting top 10 countries
whr2015_2022_top10 <- whr2022 %>%
  slice_max(happiness_score, n = 10) %>%
  mutate(cat = 'top_10', 
         country_rank = rank(-happiness_score),
         country_label = paste0(country, ' (', country_rank, ')'))

# Getting bottom 10 countries
whr2015_2022_bottom10 <- whr2022 %>%
  mutate(country_rank = rank(happiness_score),
         country_label = paste0(country, ' (', country_rank, ')')) %>%
  slice_min(happiness_score, n = 10) %>%
  mutate(cat = 'bottom_10')

# Plotting top 10 happiest countries 
top_10 <- ggplot(whr2015_2022_top10, aes(x = reorder(country_label, happiness_score))) + 
  geom_chicklet(aes(y = 10, fill = 4.9), width = 0.5, radius = grid::unit(5, "pt")) +
  geom_chicklet(aes(y = happiness_score, fill = happiness_score), width = 0.5, radius = grid::unit(5, "pt")) +
  geom_text(aes(y = happiness_score), label = round(whr2015_2022_top10$happiness_score, 2), nudge_y = 0.4, size = 3) + 
  scale_y_continuous(expand = c(0, 0.1), position = "right", limits = c(0, 10)) +
  scale_fill_gradient2(low = 'black', high = '#818aeb', mid = 'white', midpoint = 5) + 
  coord_flip() +
  labs(y="Best possible life = 10", x = '',
       title="Top 10 Happiest Countries in 2022",
       subtitle="8 of the happiest countries present in Europe",
       caption="Source: The World Happiness Report 2022") + 
  theme_ipsum(grid = '')  +
  theme(plot.title = element_text(size=15),
        plot.subtitle = element_text(size = 12),
        plot.caption = element_text(size = 10),
        axis.title.x = element_text(size= 10, color = '#555955'),
        axis.text.y = element_text(size = 10, color = 'black'),
        axis.text.x = element_blank(),
        legend.position = 'None')

# Plotting 10 saddest countries
bottom_10 <- ggplot(whr2015_2022_bottom10, aes(x = reorder(country_label, -happiness_score))) +
  geom_chicklet(aes(y = 10, fill = 4.9), width = 0.5, radius = grid::unit(5, "pt")) +
  geom_chicklet(aes(y = happiness_score, fill = happiness_score), width = 0.5, radius = grid::unit(5, "pt")) +
  geom_text(aes(y = happiness_score), label = round(whr2015_2022_bottom10$happiness_score, 2), nudge_y = 0.4, size = 3) + 
  scale_y_continuous(expand = c(0, 0.1), position = "right", limits = c(0, 10)) +
  scale_fill_gradient2(low = '#074040', high = '#4cc2c2', mid = 'white', midpoint = 5) + 
  coord_flip() +
  labs(y="Best possible life = 10", x = '',
       title="Top 10 Saddest Countries in 2022",
       subtitle="Ordered from saddest to less sad",
       caption="Source: The World Happiness Report 2022") + 
  theme_ipsum(grid = '') +
  theme(plot.title = element_text(size=15),
        plot.subtitle = element_text(size = 12),
        plot.caption = element_text(size = 10),
        axis.title.x = element_text(size= 10, color = '#555955'),
        axis.text.y = element_text(size = 10, color = 'black'),
        axis.text.x = element_blank(),
        legend.position = 'None')

# Displaying plots side by side
top_10 + bottom_10

# Creating a new variable for sorted regions
whr2022_sorted <- whr2022 %>%
  group_by(region) %>%
  mutate(avg_happiness = mean(happiness_score)) %>%
  ungroup() %>%
  mutate(region_sorted = reorder(region, avg_happiness))


# ggplot code
ggplot(whr2022_sorted, aes(x = region_sorted, y = happiness_score, fill = region_sorted)) +
  geom_beeswarm(aes(color = region_sorted), size = 2.5, alpha = 0.8) +
  geom_boxplot(aes(group = region_sorted), alpha = 0.3, outlier.shape = NA) + 
  labs(title = "Middle East and North African countries have the highest disparity in \nHappiness score",
       subtitle = "Country-wise Happiness Trends in World Regions",
       x = "Region",
       y = "Happiness Score") +
  geom_hline(yintercept = 5, color = "#f7347a", linetype = "longdash") +
  theme_classic() +
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 0, size = 9)) +
  scale_x_discrete(labels = scales::wrap_format(10)) +  
  scale_fill_brewer(palette = "Spectral") +
  scale_color_brewer(palette = "Spectral")

# ggplotly code
region_level <- ggplot(whr2022_sorted, aes(x = region_sorted, y = happiness_score, fill = region_sorted, text = country)) +
  geom_beeswarm(aes(color = region_sorted, alpha = 1)) +
  labs(title = "Country-wise happiness trends in world regions",
       subtitle = "Middle East and North African countries have the highest disparity in Happiness score",
       x = "Region",
       y = "Happiness score") +
  geom_hline(yintercept = 5, colour = "#f7347a", linetype = "longdash") +
  theme_classic() +
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 0, hjust = 1, size = 8)) +
  scale_x_discrete(labels = wrap_format(10)) +  
  scale_fill_brewer(palette = "Spectral") +
  scale_color_brewer(palette = "Spectral") +
  geom_boxplot(aes(alpha = 2))

# Convert to ggplotly with tooltips
ggplotly(region_level, tooltip = c("country", "happiness_score"))%>%
  layout(title = list(text = paste0('Middle East and North Africa has the highest disparity in Happiness',
                                    '<br>',
                                    '<sup>',
                                    'Country-wise happiness trends in world regions',
                                    '</sup>')))
# Calculate the correlation coefficients
cor_co2 <- cor(joined_data$happiness_score, log10(joined_data$co2_emissions_per_capita), use = "complete.obs")
cor_urban <- cor(joined_data$happiness_score, joined_data$urban_population_per_capita, use = "complete.obs")

# Scatter plot for Happiness Score vs CO2 Emissions Per Capita (Log Scale)
ggplot(joined_data, aes(x = co2_emissions_per_capita, y = happiness_score, size = co2_emissions)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE) +
  scale_x_log10() +
  labs(
    title = "Higher CO2 Emissions Per Capita is correlated with Higher Happiness",
    subtitle = paste("Happiness Score relative to CO2 Emissions Per Capita (Log Scale), r =", round(cor_co2, 2)),
    x = "CO2 Emissions Per Capita (Log Scale)",
    y = "Happiness Score",
    size = "Absolute CO2 Emissions"
  ) +
  theme_classic()

# Calculate correlation coefficients
cor_life_expectancy <- cor(joined_data$happiness_score, joined_data$life_expectancy, use = "complete.obs")

# Create a scatter plot for Happiness Score vs Life Expectancy
ggplot(joined_data, aes(x = life_expectancy, y = happiness_score)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Longer Life Expectancy is correlated with higher Happiness",
    subtitle = paste("Happiness Score relative to Life Expectancy, r =", round(cor_life_expectancy, 2)),
    x = "Life Expectancy",
    y = "Happiness Score"
  ) +
  theme_minimal()

# Population density plot
ggplot(joined_data, aes(x = density_p_km2, y = happiness_score, size = urban_population)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Population Density is inconclusively linked to Happiness",
    subtitle = paste("Happiness Score relative to Population Density, r =", round(cor_urban, 2)),
    x = "Population density",
    y = "Happiness Score",
    size = "Absolute Urban Population"
  ) +
  theme_classic() +
  theme(legend.key.size = unit(0.5, "lines"))+
  xlim(0, 500)

# Scatter plot for Happiness Score vs Urban Population Per Capita
ggplot(joined_data, aes(x = urban_population_per_capita, y = happiness_score, size = urban_population)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Urban Population Per Capita is positively correlated to Happiness",
    subtitle = paste("Happiness Score relative to Urban Population Per Capita, r =", round(cor_urban, 2)),
    x = "Urban Population Per Capita",
    y = "Happiness Score",
    size = "Absolute Urban Population"
  ) +
  theme_classic() +
  theme(legend.key.size = unit(0.5, "lines"))

# Aggregate data by year
yearly_data <- aggregate(happiness_score ~ year, whr2015_2022, mean)

# Convert to a time series object
ts_data <- ts(yearly_data$happiness_score, start = min(yearly_data$year), end = max(yearly_data$year), frequency = 1)

# Fit an ARIMA model
fit <- auto.arima(ts_data)

# Forecast for the next 2 years (2023 and 2024)
forecasted_values <- forecast(fit, h = 2)

# Convert the forecast object to a data frame
forecast_df <- data.frame(
  year = c(yearly_data$year, (max(yearly_data$year) + 1):(max(yearly_data$year) + 2)),
  value = c(fitted(fit), forecasted_values$mean),
  lower = c(rep(NA, length(fitted(fit))), forecasted_values$lower[, "80%"]),
  upper = c(rep(NA, length(fitted(fit))), forecasted_values$upper[, "80%"])
)

# Create the plot with ggplot2
ggplot(forecast_df, aes(x = year, y = value)) +
  geom_line(size = 1, color = "darkblue") +
  geom_point(color = "darkblue") +
  geom_line(size = 1, data = forecast_df[length(yearly_data$year) + 1:nrow(forecast_df), ], color = "darkgrey") +
  geom_point(data = forecast_df[length(yearly_data$year) + 1:nrow(forecast_df), ], color = "grey") +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey", alpha = 0.2) +
  scale_x_continuous(breaks = forecast_df$year) +
  labs(title = "The overall worldwide happiness score increased from 2015 to 2022",
       subtitle = "Worldwide happiness score fluctuations over the years",
       x = "Year", y = "Happiness Score") +
  theme_minimal() +
  theme(plot.title = element_text(size = 14),
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title.position = "plot")

# Aggregate data by region and year
region_yearly_data <- whr2015_2022 %>%
  dplyr::group_by(region, year) %>%
  dplyr::summarize(avg_happiness_score = mean(happiness_score, na.rm = TRUE)) %>%
  ungroup()

# Calculate overall average happiness score for each region
region_avg_scores <- region_yearly_data %>%
  dplyr::group_by(region) %>%
  dplyr::summarize(overall_avg_happiness = mean(avg_happiness_score)) %>%
  arrange(desc(overall_avg_happiness)) %>%
  ungroup()

# Create a color mapping based on these averages using the "Spectral" palette
num_regions <- nrow(region_avg_scores)
spectral_colors <- colorRampPalette(brewer.pal(11, "Spectral"))(num_regions)
names(spectral_colors) <- region_avg_scores$region

# Adjusting the order of the 'region' factor according to happiness score
region_yearly_data$region <- factor(region_yearly_data$region, levels = region_avg_scores$region)

# Create the trend plot with custom colors
ggplot(region_yearly_data, aes(x = year, y = avg_happiness_score, group = region, color = region)) +
  geom_line(size = 1) +
  geom_point() +
  scale_color_manual(values = spectral_colors) +
  theme_minimal() +
  labs(title = "The happiness score for each region remained relatively stable \nfrom 2015 to 2022",
       subtitle = "Change in happiness by region over time",
       x = "Year",
       y = "Average Happiness Score",
       color = "Region") 

# Melting data for stacked bar plot
melted_whr <- melt(whr2015_2022, id.vars = c("year", "country"), 
                   measure.vars = c("economy_gdp_per_capita", "family_social_support", 
                                    "health_life_expectancy", "freedom", 
                                    "trust_government_corruption", "generosity"))

# Determine the number of unique factors (variables) for the color palette
num_factors <- length(unique(melted_whr$variable))

# Create a color mapping using the "Spectral" palette
spectral_colors <- colorRampPalette(brewer.pal(11, "Spectral"))(num_factors)

# Define custom names for the legends
legend_names <- c("Economy (GDP per Capita)", "Family/Social Support", "Health (Life Expectancy)", 
                  "Freedom", "Trust (Government Corruption)", "Generosity")

# Stacked bar chart with "Spectral" color set and renamed legends
ggplot(melted_whr, aes(x = as.factor(year), y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = spectral_colors, labels = legend_names) +  # Apply "Spectral" colors with custom legend names
  theme_minimal() +
  labs(title = "Economy and social support seem to be the two main happiness contributors",
       subtitle = "Contribution of happiness factors over the years",
       x = "Year", y = "Factor Contribution",
       fill = "Happiness Factor")+
  theme(plot.title = element_text(size = 13),
                                  plot.title.position = "plot")

# Density ridge plot
ggplot(whr2015_2022, aes(x = happiness_score, y = factor(year), fill = region)) +
  geom_density_ridges(scale = 1) +
  labs(title = "Western Europe always ranks first while Sub-Saharan Africa and \nSouth Asia report lower scores", x = "Happiness Score", y = "Year",
       subtitle = "Distribution of Happiness Scores of Different Countries by Region and Year",
       fill = "Region") +
  scale_fill_viridis_d() +
  theme(
    panel.background = element_blank(), 
    axis.text.y = element_text(hjust = 1),
    axis.title.x = element_text(vjust = 0.5, hjust = 0.5),
    axis.title.y = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) 

# 2D Density plot
ggplot(whr2015_2022, aes(x = economy_gdp_per_capita, y = happiness_score)) +
  geom_density_2d_filled() +
  labs(title = "A positive correlation exists with the highest density observed \nat mid-range values",
       subtitle = "2D Density Plot of GDP per Capita relative to Happiness Score",
       caption = "Note: Density levels represent the concentration of countries at GDP and happiness score values.",
       x="GDP per capita",
       y="Happiness Score",
       fill = "Density Range") +
  theme_minimal() +
  theme(
    plot.caption = element_text(face = "italic", color = "grey70",hjust = 0))

# Calculate mean life expectancy for each region
means <- whr2015_2022 %>%
  group_by(region) %>%
  summarise(mean_life_expectancy = mean(health_life_expectancy, na.rm = TRUE)) %>%
  arrange(desc(mean_life_expectancy))

# Create a gradient of 10 pastel colors
colors <- colorRampPalette(c("#B3FFB3", "#FFE0B3", "#FFB3B3"))(10)

# Map these colors to the regions based on their mean life expectancy
color_mapping <- setNames(colors, means$region)

# Create the violin plot
ggplot(whr2015_2022, aes(x = reorder(region, happiness_score, FUN = mean), y = health_life_expectancy, fill = region)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1, fill = "white") +
  scale_fill_manual(values = color_mapping) +
  labs(title = "Higher life expectancy and uniform distributions are in regions \nwith higher happiness scores",
       subtitle = "Violin Plot of Life Expectancy by Region",
       x = "Region",
       y = "Life expectancy") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.margin = margin(l = 40, unit = "pt"),
    legend.position = "none"
  )

# Categories trust in multiple bins
categorise_trust <- whr2015_2022 %>% 
  mutate(trust_category = case_when(
    trust_government_corruption >= 0.0 & trust_government_corruption <= 0.1 ~ "Very Low",
    trust_government_corruption > 0.1 & trust_government_corruption <= 0.2 ~ "Low",
    trust_government_corruption > 0.2 & trust_government_corruption <= 0.3 ~ "Medium",
    trust_government_corruption > 0.3 & trust_government_corruption <= 0.4 ~ "High",
    trust_government_corruption > 0.4 ~ "Very High",
    TRUE ~ NA_character_
  )) %>% 
  filter(!is.na(trust_category)) %>% 
  mutate(trust_category = factor(trust_category, levels = c("Very Low", "Low", "Medium", "High", "Very High")))

ggplot(categorise_trust, aes(x = trust_category, y = happiness_score, color = trust_category)) +
  geom_point(size=2) +
  labs(title = "Countries with high trust in government seem to be mostly happy, \nwhile there is more disparity with lower levels of trust",
       subtitle= "Happiness Score relative to Trust in Government",
       x = "Trust in Government",
       y = "Happiness Score") +
  theme_minimal() +
  scale_color_manual(values = c(
    "Very Low" = "#FFB3B3",  # Pastel red for Very Low trust
    "Low" = "#FFD1B3",       # Pastel orange for Low trust
    "Medium" = "#FFE0B3",    # Lighter orange for Medium trust
    "High" = "#D9FFB3",      # Pale green for High trust
    "Very High" = "#B3FFB3"  # Pastel green for Very High trust
  )) +
  theme(legend.position = "none")